from sklearn.utils import Bunch
from tools import *
import jieba

def make_Bunch(wordbag_path, txt_path):
    """Build a Bunch from a tab-separated corpus file and persist it.

    Each line of *txt_path* is expected to look like:
        b'14744\\tnews_agriculture\\t<Chinese title text>\\n'
    i.e. ``id \\t label \\t text``.  The text field is cleaned, segmented
    with jieba (precise mode), and the space-joined tokens are stored.

    Parameters
    ----------
    wordbag_path : str
        Destination path passed to ``writebunchobj`` for the pickled Bunch.
    txt_path : str
        Path of the raw corpus file to read.

    Side effects
    ------------
    Writes the resulting Bunch to ``wordbag_path`` via ``writebunchobj``.
    """
    # One Bunch for the whole corpus:
    #   target_name - distinct labels seen (e.g. the 13-odd news categories)
    #   label       - per-line label
    #   filenames   - source file path, repeated per line
    #   contents    - per-line segmented text
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])

    # Context manager guarantees the handle is closed even on error
    # (original used open/close with no try/finally).
    with open(txt_path, 'rb') as f:
        data = f.readlines()

    for line in data:  # one corpus record per line
        parts = line.decode().split('\t')
        if len(parts) < 3:
            # Skip malformed lines instead of crashing with IndexError.
            continue
        tar_name = parts[1]

        # Clean-up entirely in str — no need for the original
        # str -> bytes -> str round-trip: drop line endings and spaces.
        content = parts[2].replace('\r\n', '').replace(' ', '').strip()

        # Precise-mode segmentation; space-join the tokens so downstream
        # bag-of-words vectorizers can see word boundaries.  (The original
        # ''.join simply reassembled the unsegmented string, making the
        # jieba step a no-op.)
        content_seg = jieba.lcut(content, cut_all=False)
        content_end = ' '.join(content_seg)

        if tar_name not in bunch.target_name:
            bunch.target_name.append(tar_name)  # record each distinct label once
        bunch.label.append(tar_name)       # per-record label
        bunch.filenames.append(txt_path)   # source file for this record
        bunch.contents.append(content_end)

    writebunchobj(wordbag_path, bunch)  # persist the Bunch to wordbag_path
